# Load necessary libraries
# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(scales) # For percentage formatting
# Load the dataset
file_path <- "/Users/palomacz/Documents/GitHub/QUAIL-Mex/data/Cleaned_Dataset_Screening_HWISE_PSS.csv"
df <- read.csv(file_path, stringsAsFactors = FALSE, na.strings = c("", "N/A", "NA", "pending"))
# Identify discrete variables: all columns starting with "HW" or "PSS"
discrete_columns <- grep("^HW|^PSS", names(df), value = TRUE)
# Convert all discrete variables to factors
df <- df %>% mutate(across(all_of(discrete_columns), as.factor))
# Identify categorical variables (columns ending in "CAT") and convert them to factors
cat_columns <- grep("CAT$", names(df), value = TRUE)
df <- df %>% mutate(across(all_of(cat_columns), as.factor))
# Identify numeric variables (excluding ID, HW_TOTAL, and discrete variables)
numeric_columns <- df %>%
select(-all_of("ID")) %>%
select(-all_of(discrete_columns)) %>%
select(where(is.numeric)) %>%
names()
# Ensure HW_TOTAL is numeric
df$HW_TOTAL <- as.numeric(df$HW_TOTAL)
# Define a categorical variable for HW_TOTAL threshold
df <- df %>% mutate(HW_Group = ifelse(HW_TOTAL > 8, "Above 8", "Below 8"))
# ===================================
# Numeric Variable Boxplots by HW_Group
# ===================================
for (var in numeric_columns) {
p <- ggplot(df, aes(x = HW_Group, y = .data[[var]], fill = HW_Group)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_jitter(width = 0.1, alpha = 0.5) +
labs(title = paste("Comparison of", var, "by HWISE score group"),
x = "HW Group", y = var) +
theme_minimal() +
theme(legend.position = "none")
print(p)
}
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 51 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 51 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ===================================
# Categorical & Discrete Variable Bar Plots (Using Percentages)
# ===================================
cat_discrete_vars <- c(cat_columns, discrete_columns) # Include discrete variables
for (var in cat_discrete_vars) {
p_cat <- df %>%
group_by(HW_Group, .data[[var]]) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(HW_Group) %>%
mutate(percent = count / sum(count)) %>%
ggplot(aes(x = .data[[var]], y = percent, fill = HW_Group)) +
geom_bar(stat = "identity", position = "dodge") +
scale_y_continuous(labels = percent_format(), limits = c(0,1)) + # Format y-axis as percentages
labs(title = paste("Percentage Distribution of", var, "by HWISE score group"),
x = var, y = "Percentage") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_cat)
}




























## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

# Load necessary libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2)
library(scales) # For percentage formatting
# Load the dataset
file_path <- "/Users/palomacz/Documents/GitHub/QUAIL-Mex/data/Cleaned_Dataset_Screening_HWISE_PSS.csv"
df <- read.csv(file_path, stringsAsFactors = FALSE, na.strings = c("", "N/A", "NA", "pending"))
# Identify discrete variables: all columns starting with "HW" or "PSS"
discrete_columns <- grep("^HW|^PSS", names(df), value = TRUE)
# Convert all discrete variables to factors
df <- df %>% mutate(across(all_of(discrete_columns), as.factor))
# Identify categorical variables (columns ending in "CAT") and convert them to factors
cat_columns <- grep("CAT$", names(df), value = TRUE)
df <- df %>% mutate(across(all_of(cat_columns), as.factor))
# Identify numeric variables (excluding ID, HW_TOTAL, and discrete variables)
numeric_columns <- df %>%
select(-all_of("ID")) %>%
select(-all_of(discrete_columns)) %>%
select(where(is.numeric)) %>%
names()
# Ensure HW_TOTAL is numeric
df$HW_TOTAL <- as.numeric(df$HW_TOTAL)
# Define a categorical variable for HW_TOTAL threshold (Above/Below 12)
df <- df %>% mutate(HW_Group = ifelse(HW_TOTAL > 12, "Above 12", "Below 12"))
# ===================================
# Categorical & Discrete Variable Bar Plots (Using Percentages)
# ===================================
cat_discrete_vars <- c(cat_columns, discrete_columns) # Include discrete variables
for (var in cat_discrete_vars) {
p_cat <- df %>%
group_by(HW_Group, .data[[var]]) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(HW_Group) %>%
mutate(percent = count / sum(count)) %>%
ggplot(aes(x = .data[[var]], y = percent, fill = HW_Group)) +
geom_bar(stat = "identity", position = "dodge") +
scale_y_continuous(limits = c(0, 1), labels = percent_format()) + # Set y-axis from 0 to 1
labs(title = paste("Percentage Distribution of", var, "by HWISE score group"),
x = var, y = "Percentage") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_cat)
}




























## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

# Identify numeric columns (excluding ID and HW_TOTAL)
numeric_columns <- df %>%
select(where(is.numeric)) %>%
select(-ID) %>%
names()
# Remove numeric columns that contain only NA values
numeric_columns <- numeric_columns[sapply(df[numeric_columns], function(x) !all(is.na(x)))]
# Create histograms for each numeric variable with shared y-axis
for (var in numeric_columns) {
# Get the max count across HW Groups to set consistent y-axis limits
max_count <- df %>%
group_by(HW_Group) %>%
summarise(count = sum(!is.na(.data[[var]])), .groups = "drop") %>%
pull(count) %>%
max(na.rm = TRUE)
p <- ggplot(df, aes(x = .data[[var]], fill = HW_Group)) +
geom_histogram(binwidth = 2, color = "black", alpha = 0.7, position = "identity") +
facet_wrap(~HW_Group, scales = "free_x") + # Free x-axis but fixed y-axis
ylim(0, max_count) + # Set consistent y-axis limits
labs(title = paste("Distribution of", var, "by HW_TOTAL Group"),
x = var,
y = "Count") +
scale_fill_manual(values = c("Above 8" = "blue", "Below 8" = "red")) +
theme_minimal()
print(p) # Print each histogram
}
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 51 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 10 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
